import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime
import statistics
import warnings
# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
plt.style.use("default")
# ANSI escape sequences used in the print() calls below to highlight a
# column name (CBLUEBG) and to reset the terminal formatting (CEND).
CBLUEBG = "\33[44m"
CEND = "\33[0m"
# Load the accident records into the raw frame used throughout the notebook.
df_accident = pd.read_parquet("../data/accident_data.parquet")
df_accident.shape
(1048575, 34)
df_accident.tail(3)
| Accident_Index | 1st_Road_Class | 1st_Road_Number | 2nd_Road_Class | 2nd_Road_Number | Accident_Severity | Carriageway_Hazards | Date | Day_of_Week | Did_Police_Officer_Attend_Scene_of_Accident | Junction_Control | Junction_Detail | Latitude | Light_Conditions | Local_Authority_(District) | Local_Authority_(Highway) | Location_Easting_OSGR | Location_Northing_OSGR | Longitude | LSOA_of_Accident_Location | Number_of_Casualties | Number_of_Vehicles | Pedestrian_Crossing-Human_Control | Pedestrian_Crossing-Physical_Facilities | Police_Force | Road_Surface_Conditions | Road_Type | Special_Conditions_at_Site | Speed_limit | Time | Urban_or_Rural_Area | Weather_Conditions | Year | InScotland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1048572 | 201091NM01935 | A | 96.0 | Unclassified | 0.0 | Slight | None | 23/02/2010 | Tuesday | 1.0 | Give way or uncontrolled | T or staggered junction | 57.585044 | Daylight | Highland | Highland | 288730.0 | 856520.0 | -3.862727 | None | 1 | 3 | 0.0 | 0.0 | Northern | Frost or ice | Single carriageway | None | 30 | 09:38 | Rural | Fine no high winds | 2010 | Yes |
| 1048573 | 201091NM01964 | A | 9.0 | Unclassified | 0.0 | Serious | None | 23/02/2010 | Tuesday | 1.0 | Give way or uncontrolled | T or staggered junction | 57.214898 | Darkness - no lighting | Highland | Highland | 289940.0 | 815260.0 | -3.823997 | None | 1 | 2 | 0.0 | 0.0 | Northern | Wet or damp | Single carriageway | None | 60 | 18:25 | Rural | Fine no high winds | 2010 | Yes |
| 1048574 | 201091NM02142 | None | 0.0 | Unclassified | 0.0 | Serious | Other object on road | 28/02/2010 | Sunday | 1.0 | Give way or uncontrolled | T or staggered junction | 57.575210 | Daylight | Highland | Highland | 286730.0 | 855480.0 | -3.895673 | None | 1 | 1 | 0.0 | 0.0 | Northern | Wet or damp | Dual carriageway | None | 60 | 15:45 | Rural | Snowing no high winds | 2010 | Yes |
msno.bar(df_accident, figsize=(20, 7), fontsize=12);
Contents:
print(f"Unique Values in {CBLUEBG} 1st_Road_Class {CEND}: {df_accident['1st_Road_Class'].nunique()}, \n{df_accident['1st_Road_Class'].unique()}")
Unique Values in 1st_Road_Class : 5,
['A' 'B' 'C' None 'Motorway' 'A(M)']
#utility function
def plot_road_class(data, ax, title, bins=50):
    """Draw a histogram of ``data`` on ``ax`` titled ``"Road Class - <title>"``.

    Args:
        data: Values plotted along the x axis (callers pass a road-number
            column filtered to a single road class).
        ax: Matplotlib axes to draw on.
        title: Label appended to the ``"Road Class - "`` title prefix.
        bins: Number of histogram bins; defaults to 50, the value that was
            previously hard-coded.
    """
    sns.histplot(x=data, bins=bins, ax=ax).set(title=f"Road Class - {title}")
# One histogram of 1st_Road_Number per 1st_Road_Class value, on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
ax = ax.flatten()
for panel, road_class in zip(ax[:5], ["A", "B", "C", "Motorway", "A(M)"]):
    class_rows = df_accident[df_accident["1st_Road_Class"] == road_class]
    plot_road_class(class_rows["1st_Road_Number"], panel, road_class)
# The last panel shows the rows whose road class is missing.
unclassed_rows = df_accident[df_accident["1st_Road_Class"].isna()]
plot_road_class(unclassed_rows["1st_Road_Number"], ax[5], "None")
plt.tight_layout()
# 1st_Road_Number feature should be dropped as the inputs may be wrong.

# Same per-class histograms for the 2nd_Road_Class / 2nd_Road_Number columns.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
ax = ax.flatten()
for panel, road_class in zip(ax[:5], ["A", "B", "C", "Motorway", "A(M)"]):
    plot_road_class(df_accident[df_accident["2nd_Road_Class"] == road_class]["2nd_Road_Number"], panel, road_class)
# The last panel shows the rows whose road class is missing.
plot_road_class(df_accident[df_accident["2nd_Road_Class"].isna()]["2nd_Road_Number"], ax[5], "None")
plt.tight_layout()
# "Unclassified" is plotted on its own figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
plot_road_class(df_accident[df_accident["2nd_Road_Class"] == "Unclassified"]["2nd_Road_Number"], ax, "Unclassified")
# 2nd_Road_Number feature should be dropped as the inputs may be wrong.
df_accident[["1st_Road_Number", "2nd_Road_Number"]].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 1st_Road_Number | 1048573.0 | 1011.997007 | 1832.041637 | 0.0 | 0.0 | 125.0 | 706.0 | 9999.0 |
| 2nd_Road_Number | 1037772.0 | 387.000433 | 1316.672921 | 0.0 | 0.0 | 0.0 | 0.0 | 9999.0 |
print(f"Unique Values in {CBLUEBG} Carriageway_Hazards {CEND}: {df_accident['Carriageway_Hazards'].nunique()}, \n{df_accident['Carriageway_Hazards'].unique()}")
Unique Values in Carriageway_Hazards : 6,
['None' 'Other object on road' 'Pedestrian in carriageway - not injured'
'Vehicle load on road' 'Previous accident'
'Any animal in carriageway (except ridden horse)' None]
sns.histplot(y=df_accident["Carriageway_Hazards"], bins=50);
# None or missing
print(f"{CBLUEBG}Missing values{CEND}: {df_accident['Carriageway_Hazards'].isna().sum()}")
Missing values: 29
df_accident[df_accident["Carriageway_Hazards"] == "None"].head(3)
| Accident_Index | 1st_Road_Class | 1st_Road_Number | 2nd_Road_Class | 2nd_Road_Number | Accident_Severity | Carriageway_Hazards | Date | Day_of_Week | Did_Police_Officer_Attend_Scene_of_Accident | Junction_Control | Junction_Detail | Latitude | Light_Conditions | Local_Authority_(District) | Local_Authority_(Highway) | Location_Easting_OSGR | Location_Northing_OSGR | Longitude | LSOA_of_Accident_Location | Number_of_Casualties | Number_of_Vehicles | Pedestrian_Crossing-Human_Control | Pedestrian_Crossing-Physical_Facilities | Police_Force | Road_Surface_Conditions | Road_Type | Special_Conditions_at_Site | Speed_limit | Time | Urban_or_Rural_Area | Weather_Conditions | Year | InScotland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200501BS00001 | A | 3218.0 | None | 0.0 | Serious | None | 04/01/2005 | Tuesday | 1.0 | Data missing or out of range | Not at junction or within 20 metres | 51.489096 | None | Kensington and Chelsea | Kensington and Chelsea | 525680.0 | 178240.0 | -0.191170 | E01002849 | 1 | 1 | 0.0 | 1.0 | Metropolitan Police | Wet or damp | Single carriageway | None | 30 | 17:42 | Urban | Raining no high winds | 2005 | No |
| 1 | 200501BS00002 | B | 450.0 | C | 0.0 | Slight | None | 05/01/2005 | Wednesday | 1.0 | Auto traffic signal | Crossroads | 51.520075 | Darkness - lights lit | Kensington and Chelsea | Kensington and Chelsea | 524170.0 | 181650.0 | -0.211708 | E01002909 | 1 | 1 | 0.0 | 5.0 | Metropolitan Police | Dry | Dual carriageway | None | 30 | 17:36 | Urban | Fine no high winds | 2005 | No |
| 2 | 200501BS00003 | C | 0.0 | None | 0.0 | Slight | None | 06/01/2005 | Thursday | 1.0 | Data missing or out of range | Not at junction or within 20 metres | 51.525301 | Darkness - lights lit | Kensington and Chelsea | Kensington and Chelsea | 524520.0 | 182240.0 | -0.206458 | E01002857 | 1 | 2 | 0.0 | 0.0 | Metropolitan Police | Dry | Single carriageway | None | 30 | 00:15 | Urban | Fine no high winds | 2005 | No |
df_accident[df_accident["Carriageway_Hazards"].isna()].head(3)
| Accident_Index | 1st_Road_Class | 1st_Road_Number | 2nd_Road_Class | 2nd_Road_Number | Accident_Severity | Carriageway_Hazards | Date | Day_of_Week | Did_Police_Officer_Attend_Scene_of_Accident | Junction_Control | Junction_Detail | Latitude | Light_Conditions | Local_Authority_(District) | Local_Authority_(Highway) | Location_Easting_OSGR | Location_Northing_OSGR | Longitude | LSOA_of_Accident_Location | Number_of_Casualties | Number_of_Vehicles | Pedestrian_Crossing-Human_Control | Pedestrian_Crossing-Physical_Facilities | Police_Force | Road_Surface_Conditions | Road_Type | Special_Conditions_at_Site | Speed_limit | Time | Urban_or_Rural_Area | Weather_Conditions | Year | InScotland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11984 | 200501MM79111 | B | 219.0 | Unclassified | 0.0 | Slight | None | 17/12/2005 | Saturday | 3.0 | Give way or uncontrolled | T or staggered junction | 51.449266 | Darkness - lights lit | Southwark | Southwark | 533990.0 | 174020.0 | -0.073146 | E01003958 | 1 | 2 | 0.0 | 0.0 | Metropolitan Police | None | Single carriageway | None | 30 | 01:00 | Urban | None | 2005 | No |
| 25234 | 200501YR99065 | None | 0.0 | None | 0.0 | Slight | None | 12/12/2005 | Monday | 3.0 | Data missing or out of range | Not at junction or within 20 metres | 51.595956 | Darkness - lights lit | Haringey | Haringey | 529240.0 | 190220.0 | -0.135526 | E01001966 | 2 | 2 | 0.0 | 0.0 | Metropolitan Police | None | Single carriageway | None | 30 | 19:00 | Urban | None | 2005 | No |
| 26704 | 200501ZT80452 | A | 237.0 | None | 0.0 | Slight | None | 14/10/2005 | Friday | 3.0 | Data missing or out of range | Not at junction or within 20 metres | 51.378278 | Darkness - lights lit | Sutton | Sutton | 528510.0 | 165980.0 | -0.154876 | E01004188 | 1 | 2 | 0.0 | 0.0 | Metropolitan Police | Dry | Single carriageway | None | 30 | 02:45 | Urban | Fine no high winds | 2005 | No |
- `None`, which means most of the time there were no carriageway hazards on the road.
- `None`-labelled values are interpreted by pandas as None values.

👉 Filling the missing values (29) with the mode (`None`).
# Most frequent label, used below as the fill value for missing entries.
# Series.mode() is vectorised and ignores missing values, so a missing marker
# can never be returned as the fill value; .iloc[0] takes the first mode.
df_accident["Carriageway_Hazards"].mode().iloc[0]
'None'
# Work on a copy so the raw frame stays untouched.
df = df_accident.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: the chained in-place form acts on a temporary object
# and does not update `df` when pandas Copy-on-Write is active.
df["Carriageway_Hazards"] = df["Carriageway_Hazards"].fillna("None")
df["Carriageway_Hazards"].isna().sum()
0
print(f"Data type of Date Column: {df_accident['Date'].dtype}")
Data type of Date Column: object
df_accident["Date"].isna().sum()
0
df_accident["Date"][:7]
0 04/01/2005 1 05/01/2005 2 06/01/2005 3 07/01/2005 4 10/01/2005 5 11/01/2005 6 13/01/2005 Name: Date, dtype: object
Converting Date column from Object to Datetime
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")
print(f"Data type of Date Column: {df['Date'].dtype}")
Data type of Date Column: datetime64[ns]
df.shape
(1048575, 34)
Adding extra 2 features to the dataset (Day and Month)
# Derive the calendar parts with the vectorised .dt accessor instead of
# calling a Python lambda once per row ("Date" was converted to datetime above).
df["Day"] = df["Date"].dt.day
df["Month"] = df["Date"].dt.month
# The original timestamp column is no longer needed once the parts exist.
df.drop("Date", axis=1, inplace=True)
df.shape
(1048575, 35)
Day with Accident_Severity
plt.figure(figsize=(20, 7))
# Plot from `df`: the "Day" column was added to the working copy only, so
# the raw `df_accident` frame has no such column.
sns.histplot(df, x="Day", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=100);
Observation: there are too few Fatal cases to be visible in the graph.
Different graph for Fatal case
plt.figure(figsize=(20, 7))
# Filter and plot from `df`: the "Day" column exists only on the working
# copy, not on the raw `df_accident` frame.
sns.histplot(df[df["Accident_Severity"] == "Fatal"],
             x="Day", palette="coolwarm", alpha=1.0, bins=75).set(title="Accident_Severity: Fatal");
Month with Accident_Severity
plt.figure(figsize=(20, 7))
# Plot from `df`: the "Month" column was added to the working copy only, so
# the raw `df_accident` frame has no such column.
sns.histplot(df, x="Month", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=25);
Observation: there are too few Fatal cases to be visible in the graph.
Different graph for Fatal case
plt.figure(figsize=(20, 7))
# Filter and plot from `df`: the "Month" column exists only on the working
# copy, not on the raw `df_accident` frame.
sns.histplot(df[df["Accident_Severity"] == "Fatal"],
             x="Month", palette="coolwarm", alpha=1.0, bins=25).set(title="Accident_Severity: Fatal");
# Month number -> season name, built from a per-season month listing.
months_by_season = {
    "Spring": (3, 4, 5),
    "Summer": (6, 7, 8),
    "Autumn": (9, 10, 11),
    "Winter": (12, 1, 2),
}
month_to_season = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
df["Season"] = df["Month"].map(month_to_season)
df.shape
(1048575, 36)
# Left: Season split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Season", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Season", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal");
df_accident["Accident_Severity"].isna().sum()
0
df_accident["Accident_Severity"].value_counts()
Slight 895883 Serious 138192 Fatal 14500 Name: Accident_Severity, dtype: int64
#percentage calculation
df_accident["Accident_Severity"].value_counts()/len(df_accident)*100
Slight 85.438142 Serious 13.179029 Fatal 1.382829 Name: Accident_Severity, dtype: float64
# Take the labels from the counts themselves so every wedge is guaranteed to
# carry the label of the value it represents, whatever order value_counts()
# returns.
severity_counts = df_accident["Accident_Severity"].value_counts()
labels = severity_counts.index.tolist()
colors = sns.color_palette("pastel")
plt.pie(severity_counts, labels=labels, autopct="%.2f%%",
        shadow=True, colors=colors, explode=[0.05] * len(severity_counts));
# Fatal case data is only 1.38%, so the model will be biased towards Slight cases.

# Left: Day_of_Week split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
sns.histplot(df_accident, x="Day_of_Week", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
sns.histplot(df_accident[df_accident["Accident_Severity"] == "Fatal"],
             x="Day_of_Week", palette="tab10", alpha=1.0, bins=50, ax=ax[1]).set(title="Accident_Severity: Fatal");
df["Time"].isna().sum()
100
Time feature from object to datetime

Extract hour from Time data
# Parse the working copy's own column instead of reaching back into the raw
# frame, so the conversion does not depend on both frames sharing an index.
# Missing times become NaT.
df["Time"] = pd.to_datetime(df["Time"], format="%H:%M")
df["Time"].dtype
dtype('<M8[ns]')
# Vectorised hour extraction; rows with a missing time (NaT) yield NaN.
df["Hour"] = df["Time"].dt.hour
# Only the hour is kept as a feature.
df.drop("Time", axis=1, inplace=True)
df.shape
(1048575, 36)
plt.figure(figsize=(20, 7))
sns.histplot(df, x="Hour", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=30);
print(f"Data type = {df['Junction_Control'].dtype}\nMissing Values = {df['Junction_Control'].isna().sum()}")
Data type = object Missing Values = 0
df["Junction_Control"].nunique(), df["Junction_Control"].unique()
(7,
array(['Data missing or out of range', 'Auto traffic signal',
'Give way or uncontrolled', 'Stop sign', 'Authorised person',
'Auto traffic sigl', 'Not at junction or within 20 metres'],
dtype=object))
`Auto traffic signal` and `Auto traffic sigl` are probably the same category; the latter looks like a misspelling.
df["Junction_Control"].value_counts()
Give way or uncontrolled 502984 Data missing or out of range 353871 Auto traffic signal 105500 Not at junction or within 20 metres 76916 Stop sign 7128 Authorised person 1838 Auto traffic sigl 338 Name: Junction_Control, dtype: int64
# Merge the misspelled label into the correct one. Series.replace leaves every
# other value untouched, so no map()/fillna() round trip is needed.
df["Junction_Control"] = df["Junction_Control"].replace({"Auto traffic sigl": "Auto traffic signal"})
df["Junction_Control"].nunique()
6
df["Junction_Control"].unique()
array(['Data missing or out of range', 'Auto traffic signal',
'Give way or uncontrolled', 'Stop sign', 'Authorised person',
'Not at junction or within 20 metres'], dtype=object)
df["Junction_Control"].value_counts()
Give way or uncontrolled 502984 Data missing or out of range 353871 Auto traffic signal 105838 Not at junction or within 20 metres 76916 Stop sign 7128 Authorised person 1838 Name: Junction_Control, dtype: int64
# Left: Junction_Control split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Junction_Control", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Junction_Control", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the long category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
# `Data missing or out of range` appears in this feature. df.isna().sum()
# didn't show these as missing values, but they are actually missing, so they
# need to be treated like the other missing values.

print(f"Data type = {df['Junction_Detail'].dtype}\nMissing Values = {df['Junction_Detail'].isna().sum()}")
Data type = object Missing Values = 0
df["Junction_Detail"].nunique(), df["Junction_Detail"].unique()
(10,
array(['Not at junction or within 20 metres', 'Crossroads',
'T or staggered junction', 'Mini-roundabout', 'Other junction',
'More than 4 arms (not roundabout)', 'Private drive or entrance',
'Roundabout', 'Slip road', 'Data missing or out of range'],
dtype=object))
# Left: Junction_Detail split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Junction_Detail", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Junction_Detail", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the long category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
# - `Not at junction or within 20 metres`, which is contradictory with the
#   Junction_Control feature.
# - `Data missing or out of range` appears in this feature. df.isna().sum()
#   didn't show these as missing values, but they are actually missing, so
#   they need to be treated like the other missing values.

# Junction_Control values recorded for rows that are "not at a junction".
df[df["Junction_Detail"] == "Not at junction or within 20 metres"]["Junction_Control"][:5]
0 Data missing or out of range 2 Data missing or out of range 3 Data missing or out of range 4 Data missing or out of range 5 Data missing or out of range Name: Junction_Control, dtype: object
print(f"Data type = {df['Light_Conditions'].dtype}\nMissing Values = {df['Light_Conditions'].isna().sum()}")
Data type = object Missing Values = 2084
df["Light_Conditions"].nunique(), df["Light_Conditions"].unique()
(5,
array([None, 'Darkness - lights lit', 'Darkness - lighting unknown',
'Darkness - lights unlit', 'Darkness - no lighting', 'Daylight'],
dtype=object))
# Left: Light_Conditions split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Light_Conditions", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Light_Conditions", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the long category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
# - `Darkness - lights unlit` can be considered as `Darkness - no lighting`
#   for simplicity.
# - `Daylight`. The possibilities can be
#   (NOTE(review): this sentence is cut off in the notebook export.)

print(f"Data type = {df['Weather_Conditions'].dtype}\nMissing Values = {df['Weather_Conditions'].isna().sum()}")
Data type = object Missing Values = 21392
df["Weather_Conditions"].nunique(), df["Weather_Conditions"].unique()
(8,
array(['Raining no high winds', 'Fine no high winds', None,
'Snowing no high winds', 'Other', 'Fine + high winds',
'Raining + high winds', 'Fog or mist', 'Snowing + high winds'],
dtype=object))
# Left: Weather_Conditions split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Weather_Conditions", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Weather_Conditions", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the long category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
# `Fine no high winds`.
# (NOTE(review): the rest of this observation is missing from the notebook
# export.)

print(f"Data type = {df['Number_of_Casualties'].dtype}\nMissing Values = {df['Number_of_Casualties'].isna().sum()}")
Data type = int64 Missing Values = 0
# Left: Number_of_Casualties split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Number_of_Casualties", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Number_of_Casualties", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
plt.tight_layout();
# Box plots of the same column, with and without the outlier markers.
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
with_fliers, without_fliers = ax
sns.boxplot(x=df["Number_of_Casualties"], ax=with_fliers)
sns.boxplot(x=df["Number_of_Casualties"], ax=without_fliers, showfliers=False)
plt.tight_layout();
print(f"Data type = {df['Number_of_Vehicles'].dtype}\nMissing Values = {df['Number_of_Vehicles'].isna().sum()}")
Data type = int64 Missing Values = 0
# Left: Number_of_Vehicles split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Number_of_Vehicles", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Number_of_Vehicles", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
plt.tight_layout();
# Box plots of the same column, with and without the outlier markers.
fig, ax = plt.subplots(ncols=2, figsize=(20, 5))
with_fliers, without_fliers = ax
sns.boxplot(x=df["Number_of_Vehicles"], ax=with_fliers)
sns.boxplot(x=df["Number_of_Vehicles"], ax=without_fliers, showfliers=False)
plt.tight_layout();
print(f"Data type = {df['Road_Type'].dtype}\nMissing Values = {df['Road_Type'].isna().sum()}")
Data type = object Missing Values = 7266
df["Road_Type"].nunique(), df["Road_Type"].unique()
(5,
array(['Single carriageway', 'Dual carriageway', 'One way street',
'Roundabout', 'Slip road', None], dtype=object))
# Left: Road_Type split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Road_Type", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Road_Type", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
df[df["Road_Type"] == "Dual carriageway"]["1st_Road_Class"][:5]
1 B 7 A 14 A 22 A 24 A Name: 1st_Road_Class, dtype: object
print(f"Data type = {df['Road_Surface_Conditions'].dtype}\nMissing Values = {df['Road_Surface_Conditions'].isna().sum()}")
Data type = object Missing Values = 1189
df["Road_Surface_Conditions"].nunique(), df["Road_Surface_Conditions"].unique()
(5,
array(['Wet or damp', 'Dry', 'Frost or ice', 'Snow',
'Flood over 3cm. deep', None], dtype=object))
# Left: Road_Surface_Conditions split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Road_Surface_Conditions", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Road_Surface_Conditions", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
print(f"Data type = {df['Special_Conditions_at_Site'].dtype}\nMissing Values = {df['Special_Conditions_at_Site'].isna().sum()}")
Data type = object Missing Values = 16
df["Special_Conditions_at_Site"].nunique(), df["Special_Conditions_at_Site"].unique()
(10,
array(['None', 'Oil or diesel', 'Roadworks', 'Auto signal part defective',
'Road surface defective', 'Auto traffic signal - out',
'Road sign or marking defective or obscured', 'Mud',
'Auto traffic sigl - out', None, 'Auto sigl part defective'],
dtype=object))
df["Special_Conditions_at_Site"].value_counts()[5:]
Road sign or marking defective or obscured 1636 Auto traffic signal - out 1574 Auto traffic sigl - out 455 Auto signal part defective 444 Auto sigl part defective 134 Name: Special_Conditions_at_Site, dtype: int64
# `sigl` and `signal`: the "sigl" labels are merged into their "signal"
# counterparts.
site_condition_di = {"Auto traffic sigl - out": "Auto traffic signal - out",
                     "Auto sigl part defective": "Auto signal part defective"}
# Series.replace rewrites only the listed labels and leaves every other value
# (including missing ones) as it was, so no map()/fillna() round trip is needed.
df["Special_Conditions_at_Site"] = df["Special_Conditions_at_Site"].replace(site_condition_di)
df["Special_Conditions_at_Site"].nunique()
8
df["Special_Conditions_at_Site"].value_counts()
None 1021535 Roadworks 12873 Oil or diesel 3998 Mud 3458 Road surface defective 2452 Auto traffic signal - out 2029 Road sign or marking defective or obscured 1636 Auto signal part defective 578 Name: Special_Conditions_at_Site, dtype: int64
# Left: Special_Conditions_at_Site split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Special_Conditions_at_Site", hue="Accident_Severity", palette="coolwarm", alpha=1.0, bins=50, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Special_Conditions_at_Site", palette="tab10", alpha=1.0, bins=50, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
# Tilt the long category labels on both panels.
for panel in ax:
    panel.tick_params(axis="x", rotation=45)
plt.tight_layout();
print(f"Data type = {df['Urban_or_Rural_Area'].dtype}\nMissing Values = {df['Urban_or_Rural_Area'].isna().sum()}")
Data type = object Missing Values = 85
df["Urban_or_Rural_Area"].nunique(), df["Urban_or_Rural_Area"].unique()
(3, array(['Urban', 'Rural', None, 'Unallocated'], dtype=object))
# Left: Urban_or_Rural_Area split by severity; right: fatal accidents only.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
sns.histplot(data=df, x="Urban_or_Rural_Area", hue="Accident_Severity", palette="coolwarm", alpha=1.0, ax=ax[0])
fatal_axes = sns.histplot(data=fatal_rows, x="Urban_or_Rural_Area", palette="tab10", alpha=1.0, ax=ax[1])
fatal_axes.set(title="Accident_Severity: Fatal")
plt.tight_layout();
# Take the labels from the counts themselves so every wedge is guaranteed to
# carry the label of the value it represents, whatever order value_counts()
# returns.
area_counts = df["Urban_or_Rural_Area"].value_counts()
labels = area_counts.index.tolist()
colors = sns.color_palette("pastel")
plt.pie(area_counts, labels=labels, autopct="%.2f%%",
        shadow=True, colors=colors, explode=[0.05] * len(area_counts));
print(f"Data type = {df['Police_Force'].dtype}\nMissing Values = {df['Police_Force'].isna().sum()}")
Data type = object Missing Values = 0
df["Police_Force"].nunique(), df["Police_Force"].unique()
(51,
array(['Metropolitan Police', 'City of London', 'Cumbria', 'Lancashire',
'Merseyside', 'Greater Manchester', 'Cheshire', 'Northumbria',
'Durham', 'North Yorkshire', 'West Yorkshire', 'South Yorkshire',
'Humberside', 'Cleveland', 'West Midlands', 'Staffordshire',
'West Mercia', 'Warwickshire', 'Derbyshire', 'Nottinghamshire',
'Lincolnshire', 'Leicestershire', 'Northamptonshire',
'Cambridgeshire', 'Norfolk', 'Suffolk', 'Bedfordshire',
'Hertfordshire', 'Essex', 'Thames Valley', 'Hampshire', 'Surrey',
'Kent', 'Sussex', 'Devon and Cornwall', 'Avon and Somerset',
'Gloucestershire', 'Wiltshire', 'Dorset', 'North Wales', 'Gwent',
'South Wales', 'Dyfed-Powys', 'Northern', 'Grampian', 'Tayside',
'Fife', 'Lothian and Borders', 'Central', 'Strathclyde',
'Dumfries and Galloway'], dtype=object))
# Accident counts per police force, split by severity.
plt.figure(figsize=(20, 7))
force_axes = sns.histplot(data=df, x="Police_Force", hue="Accident_Severity", palette="coolwarm", alpha=1.0)
# Tilt the force names along the x axis.
force_axes.tick_params(axis="x", rotation=75);
Drop:
# - 1st_Road_Number and 2nd_Road_Number
# - Accident_Index column is just for reference, no need for model development
# - Did_Police_Officer_Attend_Scene_of_Accident column is an occurrence after
#   the incident/accident
# - Latitude and Longitude. Latitude or Longitude alone can't locate a place:
#   the same Latitude can have multiple Longitudes. As there is no way to
#   combine them (I couldn't find one), it's better to drop them.
cols_to_drop = ["Accident_Index", "1st_Road_Number", "2nd_Road_Number", "Did_Police_Officer_Attend_Scene_of_Accident",
                "Latitude", "Longitude", "Location_Easting_OSGR", "Location_Northing_OSGR"]
# Start the modelling frame from a copy so `df` keeps every column.
df_preprocessed = df.copy()
df_preprocessed.drop(columns=cols_to_drop, inplace=True)
df_preprocessed.shape
(1048575, 28)
# Print the name of every numeric column that contains at least one missing value.
for label, content in df_preprocessed.items():
    if pd.api.types.is_numeric_dtype(content) and content.isna().any():
        print(label)
Pedestrian_Crossing-Human_Control Pedestrian_Crossing-Physical_Facilities Hour
df_preprocessed.loc[:, ["Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities", "Hour"]].isna().sum()
Pedestrian_Crossing-Human_Control 21 Pedestrian_Crossing-Physical_Facilities 37 Hour 100 dtype: int64
# Discard the rows that lack a value in any of the three numeric columns
# reported above.
numeric_with_missing = [
    "Pedestrian_Crossing-Human_Control",
    "Pedestrian_Crossing-Physical_Facilities",
    "Hour",
]
df_preprocessed.dropna(subset=numeric_with_missing, inplace=True)
Convert Object to Category
df_preprocessed.dtypes
1st_Road_Class object 2nd_Road_Class object Accident_Severity object Carriageway_Hazards object Day_of_Week object Junction_Control object Junction_Detail object Light_Conditions object Local_Authority_(District) object Local_Authority_(Highway) object LSOA_of_Accident_Location object Number_of_Casualties int64 Number_of_Vehicles int64 Pedestrian_Crossing-Human_Control float64 Pedestrian_Crossing-Physical_Facilities float64 Police_Force object Road_Surface_Conditions object Road_Type object Special_Conditions_at_Site object Speed_limit int64 Urban_or_Rural_Area object Weather_Conditions object Year int64 InScotland object Day int64 Month int64 Season object Hour float64 dtype: object
# Collect the string columns first, then convert each to an unordered categorical.
string_columns = [
    label
    for label, content in df_preprocessed.items()
    if pd.api.types.is_string_dtype(content)
]
for label in string_columns:
    df_preprocessed[label] = df_preprocessed[label].astype("category").cat.as_unordered()
df_preprocessed.dtypes
1st_Road_Class category 2nd_Road_Class category Accident_Severity category Carriageway_Hazards category Day_of_Week category Junction_Control category Junction_Detail category Light_Conditions category Local_Authority_(District) category Local_Authority_(Highway) category LSOA_of_Accident_Location category Number_of_Casualties int64 Number_of_Vehicles int64 Pedestrian_Crossing-Human_Control float64 Pedestrian_Crossing-Physical_Facilities float64 Police_Force category Road_Surface_Conditions category Road_Type category Special_Conditions_at_Site category Speed_limit int64 Urban_or_Rural_Area category Weather_Conditions category Year int64 InScotland category Day int64 Month int64 Season category Hour float64 dtype: object
df_preprocessed["Light_Conditions"].cat.categories
Index(['Darkness - lighting unknown', 'Darkness - lights lit',
'Darkness - lights unlit', 'Darkness - no lighting', 'Daylight'],
dtype='object')
# missing values filled with -1
df_preprocessed["Light_Conditions"].cat.codes.unique()
array([-1, 1, 0, 2, 3, 4], dtype=int8)
# Replace every non-numeric column with its integer category codes.
# pandas encodes a missing category as -1; adding 1 shifts it to 0 and moves
# the real categories to 1..n.
non_numeric_columns = [
    label
    for label, content in df_preprocessed.items()
    if not pd.api.types.is_numeric_dtype(content)
]
for label in non_numeric_columns:
    df_preprocessed[label] = pd.Categorical(df_preprocessed[label]).codes + 1
df_preprocessed.isna().sum().sum()
0
%%time
# Annotated heatmap of the pairwise correlations between all encoded columns.
plt.figure(figsize=(30, 20))
sns.heatmap(data=df_preprocessed.corr(), annot=True, fmt=".2f", cmap="viridis")
plt.tight_layout()
Wall time: 2.97 s
# Clustered view of the same correlations; note that .corr() is computed
# again here rather than reused from the heatmap cell.
sns.clustermap(data=df_preprocessed.corr(), annot=True, fmt=".2f", cmap="coolwarm", figsize=(30, 30))
plt.tight_layout()
# One box plot per count column, grouped by day of the week.
for count_column in ("Number_of_Vehicles", "Number_of_Casualties"):
    plt.figure(figsize=(10, 5))
    sns.boxplot(y=df["Day_of_Week"], x=df[count_column])
# Scatter of vehicle count against weekday, one facet per light condition,
# coloured by severity.
g = sns.relplot(
    data=df,
    x="Number_of_Vehicles",
    y="Day_of_Week",
    hue="Accident_Severity",
    col="Light_Conditions",
    col_wrap=3,
    kind="scatter",
    palette="bright",
    height=3,
    aspect=1.3,
)
sns.move_legend(g, "lower right")
plt.tight_layout();
<Figure size 2000x1000 with 0 Axes>
# Scatter of vehicle count against road surface, one facet per weather
# condition, coloured by severity.
g = sns.relplot(
    data=df,
    x="Number_of_Vehicles",
    y="Road_Surface_Conditions",
    hue="Accident_Severity",
    col="Weather_Conditions",
    col_wrap=3,
    kind="scatter",
    palette="bright",
    height=3,
    aspect=2,
)
sns.move_legend(g, "lower right")
plt.tight_layout();
# Fatal accidents only: first vs second road class, one facet per speed
# limit, coloured by road type.
fatal_rows = df[df["Accident_Severity"] == "Fatal"]
g = sns.relplot(
    data=fatal_rows,
    x="1st_Road_Class",
    y="2nd_Road_Class",
    hue="Road_Type",
    col="Speed_limit",
    col_wrap=3,
    kind="scatter",
    palette="bright",
    height=5,
    aspect=2,
)
sns.move_legend(g, "lower right")
plt.tight_layout();
# Distribution of vehicle counts (accidents with at most 3 vehicles), faceted
# by season (columns) and speed limit (rows), coloured by severity.
low_vehicle_rows = df[df["Number_of_Vehicles"] <= 3]
g = sns.FacetGrid(low_vehicle_rows, col="Season", row="Speed_limit", hue="Accident_Severity",
                  height=4, aspect=2, margin_titles=False)
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the replacement seaborn documents for it.
g.map(sns.histplot, "Number_of_Vehicles", kde=True, stat="density", kde_kws={"cut": 3})
g.add_legend()
g.set(xlim=(0, 3))
plt.tight_layout();
# Fatal accidents with at most 3 vehicles, faceted by season (columns) and
# speed limit (rows), coloured by urban/rural area.
# Select the rows BEFORE `df` is reset below: the facet column "Season" exists
# only on the engineered frame, not on the raw `df_accident` copy.
fatal_low_vehicle_rows = df.loc[(df["Number_of_Vehicles"] <= 3) & (df["Accident_Severity"] == "Fatal")]
g = sns.FacetGrid(fatal_low_vehicle_rows, col="Season", row="Speed_limit", hue="Urban_or_Rural_Area",
                  height=4, aspect=2, margin_titles=False)
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the replacement seaborn documents for it.
g.map(sns.histplot, "Number_of_Vehicles", kde=True, stat="density", kde_kws={"cut": 3})
g.add_legend()
g.set(xlim=(0, 3))
plt.tight_layout()
# Reset the working frame to a fresh copy of the raw data, as the original
# cell did, so anything that runs afterwards sees the same `df`.
df = df_accident.copy()